import pandas as pd
from sklearn.exceptions import UndefinedMetricWarning
import warnings
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import plotly.express as px
#data on github url
url_accident = "https://raw.githubusercontent.com/Sayalinale/RoadTraffic-Analysis/main/dft-road-casualty-statistics-accident-2021.csv"
url_casualty = "https://raw.githubusercontent.com/Sayalinale/RoadTraffic-Analysis/main/dft-road-casualty-statistics-casualty-2021.csv"
url_vehicle = "https://raw.githubusercontent.com/Sayalinale/RoadTraffic-Analysis/main/dft-road-casualty-statistics-vehicle-2021.csv"
# Load CSV files into Pandas DataFrames with low_memory=False
df_accident_data = pd.read_csv(url_accident, low_memory=False)
df_casualty_data = pd.read_csv(url_casualty, low_memory=False)
df_vehicle_data = pd.read_csv(url_vehicle, low_memory=False)
# Merge the DataFrames on the 'accident_reference' column
merged_data = pd.merge(df_accident_data, df_casualty_data, on='accident_reference')
merged_data = pd.merge(merged_data, df_vehicle_data, on='accident_reference')
# Now, 'merged_data' contains the merged data from all three DataFrames based on 'accident_reference'
print(merged_data.columns)
print(merged_data.shape)
Index(['accident_index_x', 'accident_year_x', 'accident_reference',
'location_easting_osgr', 'location_northing_osgr', 'longitude',
'latitude', 'police_force', 'accident_severity', 'number_of_vehicles',
'number_of_casualties', 'date', 'day_of_week', 'time',
'local_authority_district', 'local_authority_ons_district',
'local_authority_highway', 'first_road_class', 'first_road_number',
'road_type', 'speed_limit', 'junction_detail', 'junction_control',
'second_road_class', 'second_road_number',
'pedestrian_crossing_human_control',
'pedestrian_crossing_physical_facilities', 'light_conditions',
'weather_conditions', 'road_surface_conditions',
'special_conditions_at_site', 'carriageway_hazards',
'urban_or_rural_area', 'did_police_officer_attend_scene_of_accident',
'trunk_road_flag', 'trunk_road_flag.1', 'accident_index_y',
'accident_year_y', 'vehicle_reference_x', 'casualty_reference',
'casualty_class', 'sex_of_casualty', 'age_of_casualty',
'age_band_of_casualty', 'casualty_severity', 'pedestrian_location',
'pedestrian_movement', 'car_passenger', 'bus_or_coach_passenger',
'pedestrian_road_maintenance_worker', 'casualty_type',
'casualty_home_area_type', 'casualty_imd_decile', 'lsoa_of_casualty',
'accident_index', 'accident_year', 'vehicle_reference_y',
'vehicle_type', 'towing_and_articulation', 'vehicle_manoeuvre',
'vehicle_direction_from', 'vehicle_direction_to',
'vehicle_location_restricted_lane',
'vehicle_location_restricted_lane.1', 'skidding_and_overturning',
'hit_object_in_carriageway', 'vehicle_leaving_carriageway',
'hit_object_off_carriageway', 'first_point_of_impact',
'vehicle_left_hand_drive', 'journey_purpose_of_driver', 'sex_of_driver',
'age_of_driver', 'age_band_of_driver', 'engine_capacity_cc',
'propulsion_code', 'age_of_vehicle', 'generic_make_model',
'driver_imd_decile', 'driver_home_area_type', 'lsoa_of_driver',
'Unnamed: 28'],
dtype='object')
(246314, 82)
# Check for null values in the merged DataFrame
null_values = merged_data.isnull()
# Count the number of null values in each column
null_counts = null_values.sum()
# Print the null counts for all columns
for column, count in null_counts.items():
print(f"Column '{column}': {count} missing values")
Column 'accident_index_x': 0 missing values Column 'accident_year_x': 0 missing values Column 'accident_reference': 0 missing values Column 'location_easting_osgr': 43 missing values Column 'location_northing_osgr': 43 missing values Column 'longitude': 43 missing values Column 'latitude': 43 missing values Column 'police_force': 0 missing values Column 'accident_severity': 0 missing values Column 'number_of_vehicles': 0 missing values Column 'number_of_casualties': 0 missing values Column 'date': 0 missing values Column 'day_of_week': 0 missing values Column 'time': 0 missing values Column 'local_authority_district': 0 missing values Column 'local_authority_ons_district': 0 missing values Column 'local_authority_highway': 0 missing values Column 'first_road_class': 0 missing values Column 'first_road_number': 0 missing values Column 'road_type': 0 missing values Column 'speed_limit': 0 missing values Column 'junction_detail': 0 missing values Column 'junction_control': 0 missing values Column 'second_road_class': 0 missing values Column 'second_road_number': 0 missing values Column 'pedestrian_crossing_human_control': 0 missing values Column 'pedestrian_crossing_physical_facilities': 0 missing values Column 'light_conditions': 0 missing values Column 'weather_conditions': 0 missing values Column 'road_surface_conditions': 0 missing values Column 'special_conditions_at_site': 0 missing values Column 'carriageway_hazards': 0 missing values Column 'urban_or_rural_area': 0 missing values Column 'did_police_officer_attend_scene_of_accident': 0 missing values Column 'trunk_road_flag': 0 missing values Column 'trunk_road_flag.1': 0 missing values Column 'accident_index_y': 0 missing values Column 'accident_year_y': 0 missing values Column 'vehicle_reference_x': 0 missing values Column 'casualty_reference': 0 missing values Column 'casualty_class': 0 missing values Column 'sex_of_casualty': 0 missing values Column 'age_of_casualty': 0 missing values Column 'age_band_of_casualty': 0 missing values Column 'casualty_severity': 0 missing values Column 'pedestrian_location': 0 missing values Column 'pedestrian_movement': 0 missing values Column 'car_passenger': 0 missing values Column 'bus_or_coach_passenger': 0 missing values Column 'pedestrian_road_maintenance_worker': 0 missing values Column 'casualty_type': 0 missing values Column 'casualty_home_area_type': 0 missing values Column 'casualty_imd_decile': 0 missing values Column 'lsoa_of_casualty': 0 missing values Column 'accident_index': 0 missing values Column 'accident_year': 0 missing values Column 'vehicle_reference_y': 0 missing values Column 'vehicle_type': 0 missing values Column 'towing_and_articulation': 0 missing values Column 'vehicle_manoeuvre': 0 missing values Column 'vehicle_direction_from': 0 missing values Column 'vehicle_direction_to': 0 missing values Column 'vehicle_location_restricted_lane': 0 missing values Column 'vehicle_location_restricted_lane.1': 0 missing values Column 'skidding_and_overturning': 0 missing values Column 'hit_object_in_carriageway': 0 missing values Column 'vehicle_leaving_carriageway': 0 missing values Column 'hit_object_off_carriageway': 0 missing values Column 'first_point_of_impact': 0 missing values Column 'vehicle_left_hand_drive': 0 missing values Column 'journey_purpose_of_driver': 0 missing values Column 'sex_of_driver': 0 missing values Column 'age_of_driver': 0 missing values Column 'age_band_of_driver': 0 missing values Column 'engine_capacity_cc': 0 missing values Column 'propulsion_code': 0 missing values Column 'age_of_vehicle': 0 missing values Column 'generic_make_model': 0 missing values Column 'driver_imd_decile': 0 missing values Column 'driver_home_area_type': 0 missing values Column 'lsoa_of_driver': 0 missing values Column 'Unnamed: 28': 246314 missing values
# List of columns to keep
columns_to_keep = [
'accident_reference',
'location_easting_osgr',
'location_northing_osgr',
'longitude',
'latitude',
'police_force',
'accident_severity',
'number_of_vehicles',
'date',
'day_of_week',
'time',
'local_authority_district',
'first_road_class',
'first_road_number',
'road_type',
'speed_limit',
'junction_detail',
'junction_control',
'second_road_class',
'second_road_number',
'pedestrian_crossing_human_control',
'pedestrian_crossing_physical_facilities',
'light_conditions',
'weather_conditions',
'road_surface_conditions',
'special_conditions_at_site',
'carriageway_hazards',
'did_police_officer_attend_scene_of_accident',
'trunk_road_flag',
'trunk_road_flag.1',
'vehicle_reference_x',
'casualty_class',
'sex_of_casualty',
'age_of_casualty',
'age_band_of_casualty',
'casualty_severity',
'pedestrian_location',
'pedestrian_movement',
'car_passenger',
'bus_or_coach_passenger',
'pedestrian_road_maintenance_worker',
'casualty_type',
'casualty_home_area_type',
'casualty_imd_decile',
'vehicle_type',
'towing_and_articulation',
'vehicle_manoeuvre',
'vehicle_direction_from',
'vehicle_direction_to',
'vehicle_location_restricted_lane',
'vehicle_location_restricted_lane.1',
'skidding_and_overturning',
'hit_object_in_carriageway',
'vehicle_leaving_carriageway',
'hit_object_off_carriageway',
'first_point_of_impact',
'vehicle_left_hand_drive',
'journey_purpose_of_driver',
'sex_of_driver',
'age_of_driver',
'age_band_of_driver',
'engine_capacity_cc',
'propulsion_code',
'age_of_vehicle',
'driver_imd_decile',
'driver_home_area_type',
'lsoa_of_driver'
]
# Keep only the specified columns
merged_data = merged_data[columns_to_keep]
# Remove rows with null values
merged_data = merged_data.dropna()
merged_data.shape
(246271, 67)
import pandas as pd
import plotly.express as px
fig = px.density_mapbox(merged_data,
lat='latitude',
lon='longitude',
radius=8,
center=dict(lat=merged_data['latitude'].mean(), lon=merged_data['longitude'].mean()),
zoom=6,
mapbox_style='open-street-map',
color_continuous_scale = 'rainbow',
title='Spatial Heatmap of Accidents')
fig.show()
# Downsample the data to reduce memory usage (adjust the fraction as needed)
downsampled_data = merged_data.sample(frac=0.1, random_state=42)
# Selecting relevant features for clustering
features_for_clustering = downsampled_data[['road_type']]
# Standardize the features
scaler = StandardScaler()
features_for_clustering_scaled = scaler.fit_transform(features_for_clustering)
# Apply DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(features_for_clustering_scaled)
# Add the clusters to the DataFrame
downsampled_data['cluster'] = clusters
# Plot the scatter map with clusters
fig = px.scatter_mapbox(downsampled_data,
lat='latitude',
lon='longitude',
color='cluster',
center=dict(lat=downsampled_data['latitude'].mean(), lon=downsampled_data['longitude'].mean()),
zoom=10,
mapbox_style='open-street-map',
title='DBSCAN Clusters on Map with Road Type')
fig.show()
# Downsample the data to reduce memory usage (adjust the fraction as needed)
downsampled_data = merged_data.sample(frac=0.1, random_state=42)
# Selecting relevant features for clustering
features_for_clustering = downsampled_data[['weather_conditions']]
# Standardize the features
scaler = StandardScaler()
features_for_clustering_scaled = scaler.fit_transform(features_for_clustering)
# Apply DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(features_for_clustering_scaled)
# Add the clusters to the DataFrame
downsampled_data['cluster'] = clusters
# Plot the scatter map with clusters
fig = px.scatter_mapbox(downsampled_data,
lat='latitude',
lon='longitude',
color='cluster',
center=dict(lat=downsampled_data['latitude'].mean(), lon=downsampled_data['longitude'].mean()),
zoom=10,
mapbox_style='open-street-map',
title='DBSCAN Clusters on Map with Road Type')
fig.show()
# Downsample the data to reduce memory usage (adjust the fraction as needed)
downsampled_data = merged_data.sample(frac=0.1, random_state=42)
# Selecting relevant features for clustering
features_for_clustering = downsampled_data[['road_surface_conditions']]
# Standardize the features
scaler = StandardScaler()
features_for_clustering_scaled = scaler.fit_transform(features_for_clustering)
# Apply DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=3)
clusters = dbscan.fit_predict(features_for_clustering_scaled)
# Add the clusters to the DataFrame
downsampled_data['cluster'] = clusters
# Plot the scatter map with clusters
fig = px.scatter_mapbox(downsampled_data,
lat='latitude',
lon='longitude',
color='cluster',
center=dict(lat=downsampled_data['latitude'].mean(), lon=downsampled_data['longitude'].mean()),
zoom=6,
mapbox_style='open-street-map',
title='DBSCAN Clusters on Map with Road Surface Conditions')
fig.show()